{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beginning Data Analysis "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Developing a data analysis routine"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college = pd.read_csv('data/college.csv')\n",
    "college.sample(random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.describe(include=[np.number]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.describe(include=[np.object, pd.Categorical]).T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There's more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.describe(include=[np.number],\n",
    "   percentiles=[.01, .05, .10, .25, .5,\n",
    "                .75, .9, .95, .99]).T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data dictionaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "pd.read_csv('data/college_data_dictionary.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reducing memory by changing data types"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college = pd.read_csv('data/college.csv')\n",
    "different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',\n",
    "   'INSTNM', 'STABBR']\n",
    "col2 = college.loc[:, different_cols]\n",
    "col2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "col2.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "original_mem = col2.memory_usage(deep=True)\n",
    "original_mem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "col2.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college[different_cols].memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "col2.select_dtypes(include=['object']).nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "col2['STABBR'] = col2['STABBR'].astype('category')\n",
    "col2.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "new_mem = col2.memory_usage(deep=True)\n",
    "new_mem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "new_mem / original_mem"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There's more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.loc[0, 'CURROPER'] = 10000000\n",
    "college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'\n",
    "college[['CURROPER', 'INSTNM']].memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college['MENONLY'].dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college['MENONLY'].astype(np.int8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.assign(MENONLY=college['MENONLY'].astype('float16'),\n",
    "    RELAFFIL=college['RELAFFIL'].astype('int8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "college.index = pd.Int64Index(college.index)\n",
    "college.index.memory_usage() # previously was just 80"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Selecting the smallest of the largest"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "movie = pd.read_csv('data/movie.csv')\n",
    "movie2 = movie[['movie_title', 'imdb_score', 'budget']]\n",
    "movie2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "movie2.nlargest(100, 'imdb_score').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie2\n",
    "  .nlargest(100, 'imdb_score')\n",
    "  .nsmallest(5, 'budget')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### There's more..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Selecting the largest of each group by sorting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "movie = pd.read_csv('data/movie.csv')\n",
    "movie[['movie_title', 'title_year', 'imdb_score']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "  [['movie_title', 'title_year', 'imdb_score']]\n",
    "  .sort_values('title_year', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "  [['movie_title', 'title_year', 'imdb_score']]\n",
    "  .sort_values(['title_year','imdb_score'],\n",
    "               ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "  [['movie_title', 'title_year', 'imdb_score']]\n",
    "  .sort_values(['title_year','imdb_score'],\n",
    "               ascending=False)\n",
    "  .drop_duplicates(subset='title_year')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## There's more..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "  [['movie_title', 'title_year', 'imdb_score']]\n",
    "  .groupby('title_year', as_index=False)\n",
    "  .apply(lambda df: df.sort_values('imdb_score',\n",
    "         ascending=False).head(1))\n",
    "  .sort_values('title_year', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "  [['movie_title', 'title_year',\n",
    "    'content_rating', 'budget']]\n",
    "   .sort_values(['title_year',\n",
    "       'content_rating', 'budget'],\n",
    "       ascending=[False, False, True])\n",
    "   .drop_duplicates(subset=['title_year',\n",
    "        'content_rating'])\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Replicating nlargest with sort_values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "movie = pd.read_csv('data/movie.csv')\n",
    "(movie\n",
    "   [['movie_title', 'imdb_score', 'budget']]\n",
    "   .nlargest(100, 'imdb_score') \n",
    "   .nsmallest(5, 'budget')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "   [['movie_title', 'imdb_score', 'budget']]\n",
    "   .sort_values('imdb_score', ascending=False)\n",
    "   .head(100)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "   [['movie_title', 'imdb_score', 'budget']]\n",
    "   .sort_values('imdb_score', ascending=False)\n",
    "   .head(100) \n",
    "   .sort_values('budget')\n",
    "   .head(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "   [['movie_title', 'imdb_score', 'budget']]\n",
    "   .nlargest(100, 'imdb_score')\n",
    "   .tail()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(movie\n",
    "   [['movie_title', 'imdb_score', 'budget']]\n",
    "   .sort_values('imdb_score', ascending=False) \n",
    "   .head(100)\n",
    "   .tail()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating a trailing stop order price"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How to do it..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas_datareader.data as web\n",
    "import requests_cache\n",
    "session = requests_cache.CachedSession(\n",
    "   cache_name='cache', backend='sqlite', \n",
    "   expire_after=datetime.timedelta(days=90))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "tsla = web.DataReader('tsla', data_source='yahoo',\n",
    "   start='2017-1-1', session=session)\n",
    "tsla.head(8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "tsla_close = tsla['Close']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "tsla_cummax = tsla_close.cummax()\n",
    "tsla_cummax.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "lines_to_next_cell": 2
   },
   "outputs": [],
   "source": [
    "(tsla\n",
    "  ['Close']\n",
    "  .cummax()\n",
    "  .mul(.9)\n",
    "  .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How it works..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "lines_to_next_cell": 2
   },
   "source": [
    "### There's more..."
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "main_language": "python",
   "notebook_metadata_filter": "-all"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
